From ea2a57cab95db313b26ee439d5f6da8f958fdaf9 Mon Sep 17 00:00:00 2001
From: Pedro Folloni Pesserl <fpesserl7@gmail.com>
Date: Sun, 26 Feb 2023 02:10:04 -0300
Subject: [PATCH] almost works

---
 tshell/tshell_p3.sh | 99 +++++++++++++++++++--------------------------
 1 file changed, 41 insertions(+), 58 deletions(-)

diff --git a/tshell/tshell_p3.sh b/tshell/tshell_p3.sh
index fb3d7a1..eb3323f 100755
--- a/tshell/tshell_p3.sh
+++ b/tshell/tshell_p3.sh
@@ -37,66 +37,49 @@ CSV=$(realpath $CSV)
 
 # converter o arquivo xml.gz para o formato csv
 zcat $XML | \
-	xgrep -tx "//PMID|//ArticleTitle|//Abstract|//MeshHeadingList" | \
-	# o sed acha os campos úteis -- PMID seguido de ArticleTitle seguido
-	# de Abstract, seguido ou não de MeshHeadingList -- e converte no
-	# formato csv. No caso de não haver MeshHeadingList para um dado artigo,
-	# cria um campo vazio (<).
-	sed -En '
-	/PMID/{
-		s/<PMID Version="//;
-		s/">//;
-		s/<\/PMID>//;
-		N;
+xgrep -tx "//PMID|//ArticleTitle|//Abstract|//MeshHeadingList" | \
+# sed finds the useful fields -- a PMID followed by an ArticleTitle and
+# an Abstract, optionally followed by a MeshHeadingList -- and converts
+# them to the csv format. When an article has no MeshHeadingList, a
+# blank line is inserted in its place.
+sed -En '
+:x; /<PMID/{
+	# remove the whole opening tag; the Version value must not end up glued to the id
+	s/<PMID[^>]*>//;
+	s/<\/PMID>/</;
+	x; # park the PMID line in the hold space
+	n; # load the next input line (nothing is auto-printed under -n)
 
-		/ArticleTitle/{
-			s/<ArticleTitle>/</;
-			s/<\/ArticleTitle>//;
-			N;
+	/<ArticleTitle>/{ # only continue if the line after the PMID is the title
+		s/<ArticleTitle>//; # drop the opening tag
+		s/ *<\/ArticleTitle>/</; # closing tag and the spaces before it become the < delimiter
+		H; # append the title to the hold space, below the PMID
+		n; # load the next input line
 
-			/Abstract/{
-				s/<Abstract> *<AbstractText>/</;
-				s/<\/AbstractText> *<\/Abstract>//;
-				p;
-				n;
-
-				/MeshHeadingList/{
-					s/ *<MeshHeading> *<Descriptor[^>]*>//g;
-					s/(<\/DescriptorName>|<\/QualifierName>) *(<Qualifier[^>]*>|<\/MeshHeading>)/, /g;
-					s/<MeshHeadingList>/</;
-					s/(, *|)<\/MeshHeadingList>//;
-					p;
-				};
-
-				/PMID/{
-					i <
-				}
-			}
-		}
-	}' >> $CSV
+		/<Abstract>/{ # only continue if the line after the title is the abstract
+			s/(<Abstract>|<AbstractText[^>]*>) *//g; # drop <Abstract> and every <AbstractText ...> opening tag
+			s/ *<\/AbstractText> *(|<CopyrightInformation>)/, /g; # each </AbstractText>, plus an opening <CopyrightInformation> if present, becomes a comma
+			s/(,|<\/CopyrightInformation>) *<\/Abstract>/</g; # the end of the abstract becomes the < delimiter
+			s/(<sup>|<sub>)//g; # drop superscript/subscript opening tags
+			s/(<\/sup>|<\/sub>)/, /g; # their closing tags become a comma
+			H; # hold space now contains PMID, title and abstract, one per line
+			x; # bring the assembled record into the pattern space
+			p; # print it
+			n; # load the next input line, which may be a MeshHeadingList
 
-# converter para o formato csv
-# sed -i '
-# 	/PMID/{
-# 		s/<PMID Version="//;
-# 		s/">//;
-# 		s/<\/PMID>//;
-# 	}
-
-# 	/ArticleTitle/{
-# 		s/<ArticleTitle>/</;
-# 		s/<\/ArticleTitle>//;
-# 	}
-
-# 	/Abstract/{
-# 		s/<Abstract> *<AbstractText>/</;
-# 		s/<\/AbstractText> *<\/Abstract>//;
-# 	}
+			/<MeshHeadingList>/{ # optional last field of the record
+				s/(<\/DescriptorName>|<\/QualifierName>) *(<Qualifier[^>]*>|<\/MeshHeading> *)/, /g; # boundaries between descriptor/qualifier names become a comma
+				s/(<MeshHeadingList> *|<MeshHeading> *<Descriptor[^>]*>|(, *|)<\/MeshHeadingList>)//g; # strip the remaining list/heading/descriptor tags and the trailing comma
+				p; # print the term list right after the record printed above
+				d; # end this cycle and start over with a fresh input line
+			};
 
-# 	/MeshHeadingList/{
-# 		s/ *<MeshHeading> *<Descriptor[^>]*>//g;
-# 		s/\(<\/DescriptorName>\|<\/QualifierName>\) *\(<Qualifier[^>]*>\|<\/MeshHeading>\)/, /g;
-# 		s/<MeshHeadingList>/</;
-# 		s/\(, *\|\)<\/MeshHeadingList>//;
-# 	}' $CSV
+			i \
 
+			bx; # no MeshHeadingList on this line: recheck it as a possible PMID
+		};
+		bx; # title was not followed by an abstract: recheck this line
+	};
+	bx; # PMID was not followed by a title: recheck this line
+}' | \
+awk 'BEGIN {RS=""} {gsub(/<\n/, "<", $0); print $0}' >> "$CSV" # blank-line separated records; glue each line ending in < to the next one
-- 
GitLab